const crawler_file_tester = {

    robot_rules: [],

    /**
     * Parse the contents of the robots.txt file into testable regex rules.
     *
     * @param {string} result Raw robots.txt response body
     * @throws {string} When a line cannot be recognised
     */
    parse_robots_file: function(result){
        var rules = result.split("\n");
        $('#robots-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle"> </span>');

        var agent = '*';
        for(var r in rules){
            if( rules[r].length < 1 || rules[r].indexOf('#') === 0 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){
                // Skip blank lines, comment lines and sitemap declarations
                continue;
            }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){
                agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, '');
            }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){
                var rule =
                    '^'+rules[r]
                        .replace(/disallow:/gi, '')          // remove the directive name
                        .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // trim surrounding white space
                        .replace(/\?/g, '\\?')               // escape query string markers
                        .replace(/\|/g, '\\|')               // escape pipes
                        .replace(/\//g, '\\/')               // escape every slash, not just the first
                        .replace(/^\^/, '')                  // drop a leading caret so the one we prepend is not doubled
                        .replace(/\*/g, '(.*?)');            // turn wildcards into match-anything groups
                crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] });
            }else{
                console.log(rules[r]);
                throw "Found a rule which we don't understand. Report it to the developer";
            }
        }
    },
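
    // Illustrative example (values assumed, not taken from a real crawl): the
    // robots.txt lines
    //     User-agent: *
    //     Disallow: /admin/
    // become one robot_rules entry whose regex source is ^\/admin\/ :
    //     { 'rule': '^\\/admin\\/', 'agent': '*', 'original': 'Disallow: /admin/' }
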
    /**
     * Check every tested URL and report any that are blocked by a rule in the robots file.
     *
     * @returns {undefined}
     */
    test_blocked_pages: function(){
        for(var t in crawler.tested){
            var url = crawler.tested[t];

            if( crawler.linked_from.hasOwnProperty(url) ) {
                for (var r in this.robot_rules) {
                    var regex = new RegExp(this.robot_rules[r]['rule']);
                    // Tested URLs appear to be stored without a leading slash,
                    // so prepend one before matching the path-anchored rule
                    if (regex.test('/' + url)) {
                        var link = crawler.painter.create_link(url, url),
                            status = crawler.painter.create_status('error', 'Page has links and is blocked in robots'),
                            agent = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent'];
                        // Column order matches the headers registered for 'blocked_pages' below
                        crawler.painter.add_row(
                            'blocked_pages',
                            [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]);
                    }
                }
            }
        }

        return undefined;
    },
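
    // Illustrative match (assumed values): with the ^\/admin\/ rule from the
    // example above, a tested URL of 'admin/login' is checked as '/admin/login',
    // which matches, so the page is reported as blocked for that rule's agent.
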
    /**
     * Set up an AJAX call to fetch a URL through the crawler's proxy.
     *
     * @param {string} url
     * @param {function} callback
     * @param {function} failed_callback
     */
    get_file_contents: function(url, callback, failed_callback){
        $.ajax({
            // Encode the target URL so its own query string survives as a parameter
            'url': crawler.get_proxy('/seotest/getPage?u='+encodeURIComponent(url)+'&agent='+crawler.agent)
        }).done(callback).fail(failed_callback);
    }
};
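
// A quick hand test (illustrative; assumes jQuery and the crawler globals are
// already on the page): feed the parser a tiny robots.txt and inspect the rules.
//
//     crawler_file_tester.parse_robots_file("User-agent: *\nDisallow: /admin/");
//     console.log(crawler_file_tester.robot_rules);
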
// Register the tests
crawler.event_handler.on('BEFORE_INIT', function(){
    crawler.regiser_test('blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status'], false);
    crawler.painter.set_type('blocked_pages', 'default');
});

// Start up the file testers
crawler.event_handler.on('AFTER_INIT', function(){
    crawler_file_tester.get_file_contents(
        crawler.robots_url,
        crawler_file_tester.parse_robots_file,
        function(){ $('#robots-check').addClass('text-danger').append('<span class="glyphicon glyphicon-remove-circle"> </span>'); }
    );
    //crawler_file_tester.init_sitemap_tester();
});

// Test for blocked pages once the crawler finishes
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler_file_tester.test_blocked_pages();
});